# Computations
import pandas as pd
import numpy as np
# preprocessing
from sklearn import preprocessing
from sklearn.impute import SimpleImputer
import re
# Visualisation libraries
## Progress Bar
import progressbar
## Text
from colorama import Fore, Back, Style
from IPython.display import Image, display, Markdown, Latex, clear_output
## plotly
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import plotly.offline as py
from plotly.subplots import make_subplots
import plotly.express as px
## seaborn
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("paper", rc={"font.size":12,"axes.titlesize":14,"axes.labelsize":12})
## matplotlib
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
from matplotlib.patches import Ellipse, Polygon
import matplotlib.gridspec as gridspec
import matplotlib.colors
from pylab import rcParams
plt.style.use('seaborn-whitegrid')
import matplotlib as mpl
mpl.rcParams['figure.figsize'] = (17, 6)
mpl.rcParams['axes.labelsize'] = 14
mpl.rcParams['xtick.labelsize'] = 12
mpl.rcParams['ytick.labelsize'] = 12
mpl.rcParams['text.color'] = 'k'
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
In this article, we analyze and predict customer churn for Telco Customer Churn data.
| Columns | Description |
|---|---|
| customerID | Customer ID |
| gender | Whether the customer is a male or a female |
| SeniorCitizen | Whether the customer is a senior citizen or not (1, 0) |
| Partner | Whether the customer has a partner or not (Yes, No) |
| Dependents | Whether the customer has dependents or not (Yes, No) |
| tenure | Number of months the customer has stayed with the company |
| PhoneService | Whether the customer has a phone service or not (Yes, No) |
| MultipleLines | Whether the customer has multiple lines or not (Yes, No, No phone service) |
| InternetService | Customer’s internet service provider (DSL, Fiber optic, No) |
| OnlineSecurity | Whether the customer has online security or not (Yes, No, No internet service) |
| OnlineBackup | Whether the customer has an online backup or not (Yes, No, No internet service) |
| DeviceProtection | Whether the customer has device protection or not (Yes, No, No internet service) |
| TechSupport | Whether the customer has tech support or not (Yes, No, No internet service) |
| StreamingTV | Whether the customer has streaming TV or not (Yes, No, No internet service) |
| StreamingMovies | Whether the customer has streaming movies or not (Yes, No, No internet service) |
| Contract | The contract term of the customer (Month-to-month, One year, Two years) |
| PaperlessBilling | Whether the customer has paperless billing or not (Yes, No) |
| PaymentMethod | The customer’s payment method (Electronic check, Mailed check, Bank transfer (automatic), Credit card (automatic)) |
| MonthlyCharges | The amount charged to the customer monthly |
| TotalCharges | The total amount charged to the customer |
| Churn | Whether the customer churned or not (Yes or No) |
Path = 'telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv'
Data = pd.read_csv(Path)
Target = 'Churn'
Labels = ['No', 'Yes']
display(Data.head(10).style.hide_index().set_precision(2))
| customerID | gender | SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | InternetService | OnlineSecurity | OnlineBackup | DeviceProtection | TechSupport | StreamingTV | StreamingMovies | Contract | PaperlessBilling | PaymentMethod | MonthlyCharges | TotalCharges | Churn |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 7590-VHVEG | Female | 0 | Yes | No | 1 | No | No phone service | DSL | No | Yes | No | No | No | No | Month-to-month | Yes | Electronic check | 29.85 | 29.85 | No |
| 5575-GNVDE | Male | 0 | No | No | 34 | Yes | No | DSL | Yes | No | Yes | No | No | No | One year | No | Mailed check | 56.95 | 1889.5 | No |
| 3668-QPYBK | Male | 0 | No | No | 2 | Yes | No | DSL | Yes | Yes | No | No | No | No | Month-to-month | Yes | Mailed check | 53.85 | 108.15 | Yes |
| 7795-CFOCW | Male | 0 | No | No | 45 | No | No phone service | DSL | Yes | No | Yes | Yes | No | No | One year | No | Bank transfer (automatic) | 42.30 | 1840.75 | No |
| 9237-HQITU | Female | 0 | No | No | 2 | Yes | No | Fiber optic | No | No | No | No | No | No | Month-to-month | Yes | Electronic check | 70.70 | 151.65 | Yes |
| 9305-CDSKC | Female | 0 | No | No | 8 | Yes | Yes | Fiber optic | No | No | Yes | No | Yes | Yes | Month-to-month | Yes | Electronic check | 99.65 | 820.5 | Yes |
| 1452-KIOVK | Male | 0 | No | Yes | 22 | Yes | Yes | Fiber optic | No | Yes | No | No | Yes | No | Month-to-month | Yes | Credit card (automatic) | 89.10 | 1949.4 | No |
| 6713-OKOMC | Female | 0 | No | No | 10 | No | No phone service | DSL | Yes | No | No | No | No | No | Month-to-month | No | Mailed check | 29.75 | 301.9 | No |
| 7892-POOKP | Female | 0 | Yes | No | 28 | Yes | Yes | Fiber optic | No | No | Yes | Yes | Yes | Yes | Month-to-month | Yes | Electronic check | 104.80 | 3046.05 | Yes |
| 6388-TABGU | Male | 0 | No | Yes | 62 | Yes | No | DSL | Yes | Yes | No | No | No | No | One year | No | Bank transfer (automatic) | 56.15 | 3487.95 | No |
def text_sep(txt): return re.sub(r"(\w)([A-Z])", r"\1 \2", txt)
Data.rename(columns = {'gender':'Gender', 'tenure':'Tenure', 'customer ID': 'Customer ID'}, inplace = True)
Data.columns = [text_sep(txt) for txt in Data.columns.tolist()]
def Data_Plot(Inp, Title = None, W = None):
data_info = Inp.dtypes.astype(str).to_frame(name='Data Type')
Temp = Inp.isnull().sum().to_frame(name = 'Number of NaN Values')
data_info = data_info.join(Temp, how='outer')
data_info ['Size'] = Inp.shape[0]
data_info['Percentage'] = 100 - np.round(100*(data_info['Number of NaN Values']/Inp.shape[0]),2)
data_info = data_info.reset_index(drop = False).rename(columns = {'index':'Features'})
#
fig = px.bar(data_info, x= 'Features', y= 'Percentage', color = 'Data Type',
text = 'Percentage',
color_discrete_sequence = ['PaleGreen', 'LightCyan', 'PeachPuff', 'Pink', 'Plum'],
hover_data = data_info.columns)
fig.update_layout(plot_bgcolor= 'white', legend=dict(x=1.01, y=.5, traceorder="normal",
bordercolor="DarkGray", borderwidth=1))
if not W == None:
fig.update_layout(width = W)
fig.update_traces(texttemplate= 10*' ' + '%%{text}', textposition='inside')
fig.update_traces(marker_line_color= 'Black', marker_line_width=1., opacity=1)
if not Title == None:
fig.update_layout(title={'text': '<b>' + Title + '<b>', 'x':0.5,
'y':0.90, 'xanchor': 'center', 'yanchor': 'top'})
fig.show()
return data_info
def dtypes_group(Inp, Dict = False):
Temp = Inp.dtypes.to_frame(name='Data Type').sort_values(by=['Data Type'])
Out = pd.DataFrame(index =Temp['Data Type'].unique(), columns = ['Features','Count'])
for c in Temp['Data Type'].unique():
Out.loc[Out.index == c, 'Features'] = [Temp.loc[Temp['Data Type'] == c].index.tolist()]
Out.loc[Out.index == c, 'Count'] = len(Temp.loc[Temp['Data Type'] == c].index.tolist())
Out.index.name = 'Data Type'
Out = Out.reset_index(drop = False)
Out['Data Type'] = Out['Data Type'].astype(str)
if Dict:
Out = dict(zip(Out['Data Type'], Out['Features']))
return Out
def List_Print(Text, List, C = 'Blue', T = 'White'):
BACK = {'Black': Back.BLACK, 'Red':Back.RED, 'Green':Back.GREEN, 'Yellow': Back.YELLOW, 'Blue': Back.BLUE,
'Magenta':Back.MAGENTA, 'Cyan': Back.CYAN}
FORE = {'Black': Fore.BLACK, 'Red':Fore.RED, 'Green':Fore.GREEN, 'Yellow':Fore.YELLOW, 'Blue':Fore.BLUE,
'Magenta':Fore.MAGENTA, 'Cyan':Fore.CYAN, 'White': Fore.WHITE}
print(BACK[C] + FORE[T] + Style.NORMAL + '%s:' % Text + Style.RESET_ALL + ' %s' % ', '.join(List))
data_info = Data_Plot(Data, Title = 'Telco Customer Churn')
dType = dtypes_group(Data, Dict = True)
List_Print(Text = 'Integer Columns', List = dType['int64'])
#
Data[dType['int64']] = Data[dType['int64']].astype('int32')
display(Data[dType['int64']].head(5).style.hide_index().set_precision(2))
Integer Columns: Senior Citizen, Tenure
| Senior Citizen | Tenure |
|---|---|
| 0 | 1 |
| 0 | 34 |
| 0 | 2 |
| 0 | 45 |
| 0 | 2 |
dType['float64'].append('Total Charges')
dType['object'].remove('Total Charges')
List_Print(Text = 'Float Columns', List = dType['float64'], C = 'Red')
#
def myfun(x):
return re.sub(r"\s+$", "", x, flags=re.UNICODE)
Data['Total Charges'] = pd.to_numeric(Data['Total Charges'].map(lambda x: myfun(x)))
del myfun
# Imputing Missing Values
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
Temp = imp.fit_transform(Data['Total Charges'].values.reshape(-1,1))
Data['Total Charges'] = Temp.ravel()
Data[dType['float64']] = Data[dType['float64']].astype(float)
display(Data[dType['float64']].head(5).style.hide_index().set_precision(2))
Float Columns: Monthly Charges, Total Charges
| Monthly Charges | Total Charges |
|---|---|
| 29.85 | 29.85 |
| 56.95 | 1889.50 |
| 53.85 | 108.15 |
| 42.30 | 1840.75 |
| 70.70 | 151.65 |
List_Print(Text = 'Object Columns', List = dType['object'], C = 'Green')
Object Columns: customer ID, Payment Method, Paperless Billing, Contract, Streaming Movies, Streaming TV, Tech Support, Device Protection, Online Backup, Online Security, Internet Service, Multiple Lines, Phone Service, Dependents, Partner, Gender, Churn
We can convert all Yes/No columns using as follows
\begin{cases} 0 &\mbox{No}\\ 1 &\mbox{Yes}\end{cases}Temp = []
for c in dType['object']:
if set(Data[c].unique().tolist()) == {'No', 'Yes'}:
Temp.append(c)
List_Print(Text = 'Yes/No Columns', List = Temp, C = 'Yellow', T = 'Black')
Data[Temp] = Data[Temp].replace({'Yes':1, 'No':0}).astype('int32')
display(Data[Temp].head(5).style.hide_index().set_precision(2))
del Temp
Yes/No Columns: Paperless Billing, Phone Service, Dependents, Partner, Churn
| Paperless Billing | Phone Service | Dependents | Partner | Churn |
|---|---|---|---|---|
| 1 | 0 | 0 | 1 | 0 |
| 0 | 1 | 0 | 0 | 0 |
| 1 | 1 | 0 | 0 | 1 |
| 0 | 0 | 0 | 0 | 0 |
| 1 | 1 | 0 | 0 | 1 |
Some other columns can be converted similarly; however, we need to create a new feature.
Temp = []
for c in dType['object']:
if set(Data[c].unique().tolist()) == {'No', 'No internet service', 'Yes'}:
Temp.append(c)
List_Print(Text = 'Columns', List = Temp, C = 'Black', T = 'Cyan')
Columns: Streaming Movies, Streaming TV, Tech Support, Device Protection, Online Backup, Online Security
These Columns can be coded as follows
$$\mbox{InternetServiceType} = \begin{cases} 0 &\mbox{No internet service} \\ 1 &\mbox{No}\\ 2 &\mbox{Yes}\end{cases}$$def myfun(x):
if x == 'No internet service':
return 0
elif x == 'No':
return 1
else:
return 2
Data[Temp] = Data[Temp].applymap(lambda x: myfun(x)).astype('int32')
display(Data[Temp].head(5).style.hide_index().set_precision(2))
del myfun, Temp
| Streaming Movies | Streaming TV | Tech Support | Device Protection | Online Backup | Online Security |
|---|---|---|---|---|---|
| 1 | 1 | 1 | 1 | 2 | 1 |
| 1 | 1 | 1 | 2 | 1 | 2 |
| 1 | 1 | 1 | 1 | 2 | 2 |
| 1 | 1 | 2 | 2 | 1 | 2 |
| 1 | 1 | 1 | 1 | 1 | 1 |
Moreover, note that,
List_Print(Text = 'Internet Service', List = Data['Internet Service'].unique(), C = 'Black', T = 'White')
Internet Service: DSL, Fiber optic, No
This Column can be coded as follows
$$\mbox{InternetServiceType} = \begin{cases} 0 &\mbox{No} \\ 1 &\mbox{DSL}\\ 2 &\mbox{Fiber optic}\end{cases}$$def myfun(x):
if x == 'No':
return 0
elif x == 'DSL':
return 1
else:
return 2
Data['Internet Service'] = Data['Internet Service'].apply(lambda x: myfun(x)).astype('int32')
del myfun
Since, there is already a feature as Phone Service, for Multiple Lines, we can try $$ \mbox{MultipleLines} = \begin{cases} 0 &\mbox{No, No phone service}\\ 1 &\mbox{Yes}\end{cases} $$
Data['Multiple Lines'] = Data['Multiple Lines'].map(lambda x: 1 if x =='Yes' else 0).astype('int32')
dType = dtypes_group(Data, Dict = True)
List_Print(Text = 'Remaining Columns', List = dType[ 'object'], C = 'Red', T = 'White')
Remaining Columns: Contract, Gender, Payment Method, customer ID
List_Print(Text = 'Contract', List = Data['Contract'].unique(), C = 'Black', T = 'White')
Contract: Month-to-month, One year, Two year
Data['Contract'] = Data['Contract'].replace({'Month-to-month':0, 'One year':1, 'Two year':2}).astype('int32')
List_Print(Text = 'Gender', List = Data['Gender'].unique(), C = 'Black', T = 'White')
Gender: Female, Male
Data['Gender'] = Data['Gender'].map(lambda x: 1 if x =='Male' else 0).astype('int32')
List_Print(Text = 'Payment Method', List = Data['Payment Method'].unique(), C = 'Black', T = 'White')
Payment Method: Electronic check, Mailed check, Bank transfer (automatic), Credit card (automatic)
In this case, we can not rank these values. Therefore,
Data = Data.join(pd.get_dummies(Data['Payment Method']).astype('int32'))
Data = Data.drop(columns = ['Payment Method'])
Data_types = dtypes_group(Data)
display(Data_types)
| Data Type | Features | Count | |
|---|---|---|---|
| 0 | int32 | [Device Protection, Credit card (automatic), B... | 21 |
| 1 | float64 | [Monthly Charges, Total Charges] | 2 |
| 2 | object | [customer ID] | 1 |
_ = Data_Plot(Data, Title = 'Telco Customer Churn')
Moreover, high variance for some features can hurt our modeling process. For this reason, we would like to standardize features by removing the mean and scaling to unit variance.
def Feature_Normalize(X, PD):
def List_Break(mylist, n = PD['word_break']):
Out = []
for x in mylist:
y = x.split()
if len(y)> n:
z = ' '.join(y[:n])
sep = np.arange(0, len(y), n)[1:]
for n in sep:
z = z + '\n'+ ' '.join(y[n:])
else:
z = ' '.join(y)
Out.append(z)
return Out
scaler = preprocessing.StandardScaler()
X_std = scaler.fit_transform(X)
X_std = pd.DataFrame(data = X_std, columns = X.columns)
fig, ax = plt.subplots(2, 1, figsize = PD['figsize'])
ax = ax.ravel()
CP = [sns.color_palette("OrRd", 20), sns.color_palette("Greens", X.shape[1])]
Names = ['Variance of the Features', 'Variance of the Features (Standardized)']
Sets = [X, X_std]
kws = dict(label='Feature\nVariance', aspect=10, shrink= .3)
for i in range(len(ax)):
Temp = Sets[i].var().sort_values(ascending = False).to_frame(name= 'Variance').round(2).T
_ = sns.heatmap(Temp, ax=ax[i], annot=True, square=True, cmap = CP[i],
linewidths = 0.8, vmin=0, vmax=Temp.max(axis =1)[0], annot_kws={"size": PD['annot_text_size']},
cbar_kws=kws)
if not PD['word_break'] == None:
mylist = List_Break(Temp.T.index.tolist())
_ = ax[i].xaxis.set_ticklabels(mylist)
_ = ax[i].set_yticklabels('')
_ = ax[i].set_title(Names[i], weight='bold', fontsize = 14)
_ = ax[i].set_aspect(1)
del Temp
plt.subplots_adjust(hspace=PD['hspace'])
Out = pd.DataFrame(data = X_std, columns = X.columns.tolist())
return Out
X = Data.drop(columns = [Target, 'customer ID'])
PD = dict(figsize = (18, 7), hspace = 0.2, annot_text_size = 8, word_break = 2)
X = Feature_Normalize( Data.drop(columns = [Target, 'customer ID']), PD)
# Modifying dataset
Data[X.columns.tolist()] = X
def Feature_Corr(df, PD, Target = Target):
def List_Break(mylist, n = PD['word_break']):
Out = []
for x in mylist:
y = x.split()
if len(y)> n:
z = ' '.join(y[:n])
sep = np.arange(0, len(y), n)[1:]
for n in sep:
z = z + '\n'+ ' '.join(y[n:])
else:
z = ' '.join(y)
Out.append(z)
return Out
fig, ax = plt.subplots(1, 1, figsize = PD['figsize'])
CM = df.corr().round(2)
CM = CM.loc[(CM.index == Target)].drop(columns = Target).T.sort_values(by = Target).T
kws = dict(label='Feature\nVariance', aspect=10, shrink= .3)
_ = sns.heatmap(CM, ax=ax, annot=True, square=True, cmap = PD['Colormap'], linewidths = 0.8, vmin=0,
vmax=CM.max(axis =1)[0], annot_kws={"size": PD['annot_text_size']}, cbar_kws=kws)
if not PD['word_break'] == None:
mylist = List_Break(CM.T.index.tolist())
_ = ax.xaxis.set_ticklabels(mylist)
_ = ax.set_yticklabels('')
_ = ax.set_aspect(1)
PD.update(Colormap = 'Greens')
Feature_Corr(Data, PD)
Data.to_csv (Path.split(".")[0]+'_STD.csv', index = None, header=True)